#!/usr/bin/env expect
#############################################################################
# Purpose: Stress test of per-task output and input files
#
# Note:    To avoid one minute NFS propagation delays in the output files,
#          we create them ahead of time. Without explicit file creation,
#          this test requires about one minute per cycle to execute.
#
# Note:    This script generates and then deletes files in the working
#          directory named test9.4.input, test9.4.[0-9]+.input, and
#          test9.4.[0-9]+.output
############################################################################
# Copyright (C) 2002-2007 The Regents of the University of California.
# Copyright (C) 2008-2010 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# Shared input file plus the per-task file name patterns handed to srun
# (the %t files are later accessed as $test_name.<task>.input/.output)
set file_in       "${test_dir}/input"
set file_in_task  "${test_name}.%t.input"
set file_out_task "${test_name}.%t.output"

# Job name, number of test cycles and number of tasks per job
set job_name      "$test_name"
set cycle_count   [get_cycle_count]
set task_cnt      "$max_stress_tasks"

# TODO: Temporary debug for bug 15302 (remove once fixed)
set config_dir  [get_conf_path]
set config_file "${config_dir}/slurm.conf"


# Remove the per-task input/output files of every task and undo the
# temporary slurm.conf change made for debugging.
proc cleanup {} {
	global test_name task_cnt config_file

	# Delete $test_name.<task>.input and $test_name.<task>.output for
	# tasks 0 .. task_cnt-1
	set tsk 0
	while {$tsk < $task_cnt} {
		file delete "$test_name.$tsk.input" "$test_name.$tsk.output"
		incr tsk
	}

	# TODO: Temporary debug for bug 15302 (remove once fixed)
	restore_conf $config_file
	reconfigure
}


# Only launch/slurm is supported; when it is in use, node_cnt (passed to
# srun -N below) is set to the range 1-4
if {[get_config_param "LaunchType"] eq "launch/slurm"} {
	set node_cnt 1-4
} else {
	skip "This test is only compatible with systems using launch/slurm"
}
# Additional option string appended to every srun command line in this test
set other_opts    "-O"


# Run one srun job whose tasks execute "$bin_cat -" (copy stdin to stdout)
# and wait for srun to exit.
#
# input_file  - value given to srun -i (where the tasks read stdin from)
# output_file - value given to srun -o (where the tasks write stdout to)
#
# The job is submitted with -N$node_cnt -n$task_cnt and a one minute time
# limit (-t1); stderr is sent to srun's own stderr (-e -).
#
# No meaningful value is returned. Problems are reported through "fail":
# when srun prints "Unable to contact", or when srun produces neither that
# message nor EOF within the global expect timeout.
proc run_cat_job { input_file output_file } {
	global bin_cat job_name srun node_cnt other_opts task_cnt timeout

	# TODO: Temporarily add verbosity to troubleshoot bug 15302 (restore once fixed)
	spawn $srun -vvvvv --job-name=$job_name -e - -i $input_file -o $output_file -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat -
	#spawn $srun --job-name=$job_name -e - -i $input_file -o $output_file -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat -
	expect {
		-re "Unable to contact" {
			fail "Slurm appears to be down"
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			# srun closed its output: reap the spawned process
			wait
		}
	}
}


# TODO: Temporary debug for bug 15302 (remove once fixed)
# Back up slurm.conf, append a SlurmdDebug=debug4 line and reconfigure
save_conf $config_file
run_command -none "$bin_echo SlurmdDebug=debug4 >> $config_file"
reconfigure -fail

#
# Create a sizable text file (/etc/hosts followed by /etc/passwd) and
# record its line count as the expected size of every task's output
#
exec $bin_cat /etc/hosts /etc/passwd >$file_in
set stdin_lines [get_line_cnt $file_in]

# Give every task its own input file: the tasks read $file_in on stdin and
# $bin_cat writes it to the file named by the $file_in_task pattern
set timeout $max_job_delay
spawn $srun -e /dev/null -i $file_in -o $file_in_task \
	-N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat
expect {
	-re "Unable to contact" {
		fail "Slurm appears to be down"
	}
	eof {
		# srun finished: reap the spawned process
		wait
	}
	timeout {
		fail "srun not responding"
	}
}

#
# Run cycle_count jobs to copy job input to job output and compare sizes
#
set success_cnt 0
set timeout $max_job_delay
for {set inx 0} {$inx < $cycle_count} {incr inx} {
	# Start the cycle without any output file left over from the last one
	for {set tsk 0} {$tsk < $task_cnt} {incr tsk} {
		set file_out_glob "$test_name.$tsk.output"
		exec $bin_rm -f $file_out_glob
	}

	run_cat_job $file_in_task $file_out_task

	# Each task's output file must contain as many lines as the input file
	for {set tsk 0} {$tsk < $task_cnt} {incr tsk} {
		set file_out_glob "$test_name.$tsk.output"
		wait_for_file $file_out_glob
		set stdout_lines [get_line_cnt $file_out_glob]
		if {$stdout_lines != $stdin_lines} {
			# Wait one second and count again before judging the file
			exec $bin_sleep 1
			set stdout_lines [get_line_cnt $file_out_glob]
		}

		# Line counts agree: record the success and move to the next task
		if {$stdout_lines == $stdin_lines} {
			incr success_cnt
			continue
		}

		# TODO: Temporary debug for bug 15302 (remove once fixed)
		set hostname [string trimright [run_command_output "hostname -s"]]
		run_command "sudo dmesg -T | grep -i 'killed process'"
		run_command "free -h"
		run_command "top -o %MEM -n 1"

		if {$stdout_lines == 0} {
			fail "stdout is empty. Is current working directory writable from compute nodes?"
		} else {
			fail "stdout is incomplete"
		}
	}
}
